GVPT Maths Camp

Data Transformation

Learning objectives

  1. Learn basic operations in R

  2. Be introduced to dplyr

  3. Clean up and transform your data

R as a calculator


1 / 200 * 30
[1] 0.15


(59 + 73 + 2) / 3
[1] 44.66667


sin(pi / 2)
[1] 1

R objects

Create new objects with <-

x <- 3 * 4

x
[1] 12


x <- 3 * 10

x
[1] 30

R functions

Many functions come with R straight out of the box:

seq(1, 10)
 [1]  1  2  3  4  5  6  7  8  9 10


You can create objects using functions:

x <- seq(1, 10)

x
 [1]  1  2  3  4  5  6  7  8  9 10

Introduction to data transformation

Accessing gapminder data

First, you need to install the gapminder package:

install.packages("gapminder")

Then access the gapminder data set:

library(gapminder)

head(gapminder)
# A tibble: 6 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Afghanistan Asia       1962    32.0 10267083      853.
4 Afghanistan Asia       1967    34.0 11537966      836.
5 Afghanistan Asia       1972    36.1 13079460      740.
6 Afghanistan Asia       1977    38.4 14880372      786.

Data types

In gapminder:

  • fctr stands for factors, which R uses to represent categorical variables with fixed possible values.

  • int stands for integer.

  • dbl stands for doubles (or real numbers).

Data types

Other types:

  • chr stands for character vectors, or strings.

  • dttm stands for date-times (a date + a time).

  • lgl stands for logical, vectors that contain only TRUE or FALSE.1

Introducing dplyr

Help you with most of your data transformation needs.

Five basic functions:

  • filter()

  • arrange()

  • select()

  • mutate()

  • summarise()

Filter rows with filter()

library(dplyr)

filter(gapminder, country == "Australia", year > 2000)
# A tibble: 2 × 6
  country   continent  year lifeExp      pop gdpPercap
  <fct>     <fct>     <int>   <dbl>    <int>     <dbl>
1 Australia Oceania    2002    80.4 19546792    30688.
2 Australia Oceania    2007    81.2 20434176    34435.

Filter rows with filter()

library(dplyr)

filter(gapminder, continent %in% c("Asia", "Oceania"))
# A tibble: 420 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan Asia       1952    28.8  8425333      779.
 2 Afghanistan Asia       1957    30.3  9240934      821.
 3 Afghanistan Asia       1962    32.0 10267083      853.
 4 Afghanistan Asia       1967    34.0 11537966      836.
 5 Afghanistan Asia       1972    36.1 13079460      740.
 6 Afghanistan Asia       1977    38.4 14880372      786.
 7 Afghanistan Asia       1982    39.9 12881816      978.
 8 Afghanistan Asia       1987    40.8 13867957      852.
 9 Afghanistan Asia       1992    41.7 16317921      649.
10 Afghanistan Asia       1997    41.8 22227415      635.
# ℹ 410 more rows

Filter rows with filter()

library(dplyr)

filter(gapminder, pop > 500000 & pop < 1000000)
# A tibble: 88 × 6
   country  continent  year lifeExp    pop gdpPercap
   <fct>    <fct>     <int>   <dbl>  <int>     <dbl>
 1 Bahrain  Asia       1992    72.6 529491    19036.
 2 Bahrain  Asia       1997    73.9 598561    20292.
 3 Bahrain  Asia       2002    74.8 656397    23404.
 4 Bahrain  Asia       2007    75.6 708573    29796.
 5 Botswana Africa     1962    51.5 512764      984.
 6 Botswana Africa     1967    53.3 553541     1215.
 7 Botswana Africa     1972    56.0 619351     2264.
 8 Botswana Africa     1977    59.3 781472     3215.
 9 Botswana Africa     1982    61.5 970347     4551.
10 Comoros  Africa     1997    60.7 527982     1174.
# ℹ 78 more rows

Filter rows with filter()

library(dplyr)

filter(gapminder, pop > 500000 | pop < 1000000)
# A tibble: 1,704 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan Asia       1952    28.8  8425333      779.
 2 Afghanistan Asia       1957    30.3  9240934      821.
 3 Afghanistan Asia       1962    32.0 10267083      853.
 4 Afghanistan Asia       1967    34.0 11537966      836.
 5 Afghanistan Asia       1972    36.1 13079460      740.
 6 Afghanistan Asia       1977    38.4 14880372      786.
 7 Afghanistan Asia       1982    39.9 12881816      978.
 8 Afghanistan Asia       1987    40.8 13867957      852.
 9 Afghanistan Asia       1992    41.7 16317921      649.
10 Afghanistan Asia       1997    41.8 22227415      635.
# ℹ 1,694 more rows

Arrange rows with arrange()

library(dplyr)

arrange(gapminder, country, year)
# A tibble: 1,704 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan Asia       1952    28.8  8425333      779.
 2 Afghanistan Asia       1957    30.3  9240934      821.
 3 Afghanistan Asia       1962    32.0 10267083      853.
 4 Afghanistan Asia       1967    34.0 11537966      836.
 5 Afghanistan Asia       1972    36.1 13079460      740.
 6 Afghanistan Asia       1977    38.4 14880372      786.
 7 Afghanistan Asia       1982    39.9 12881816      978.
 8 Afghanistan Asia       1987    40.8 13867957      852.
 9 Afghanistan Asia       1992    41.7 16317921      649.
10 Afghanistan Asia       1997    41.8 22227415      635.
# ℹ 1,694 more rows

Arrange rows with arrange()

library(dplyr)

arrange(gapminder, country, desc(year))
# A tibble: 1,704 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan Asia       2007    43.8 31889923      975.
 2 Afghanistan Asia       2002    42.1 25268405      727.
 3 Afghanistan Asia       1997    41.8 22227415      635.
 4 Afghanistan Asia       1992    41.7 16317921      649.
 5 Afghanistan Asia       1987    40.8 13867957      852.
 6 Afghanistan Asia       1982    39.9 12881816      978.
 7 Afghanistan Asia       1977    38.4 14880372      786.
 8 Afghanistan Asia       1972    36.1 13079460      740.
 9 Afghanistan Asia       1967    34.0 11537966      836.
10 Afghanistan Asia       1962    32.0 10267083      853.
# ℹ 1,694 more rows

Select columns with select()

library(dplyr)

select(gapminder, country, year, pop)
# A tibble: 1,704 × 3
   country      year      pop
   <fct>       <int>    <int>
 1 Afghanistan  1952  8425333
 2 Afghanistan  1957  9240934
 3 Afghanistan  1962 10267083
 4 Afghanistan  1967 11537966
 5 Afghanistan  1972 13079460
 6 Afghanistan  1977 14880372
 7 Afghanistan  1982 12881816
 8 Afghanistan  1987 13867957
 9 Afghanistan  1992 16317921
10 Afghanistan  1997 22227415
# ℹ 1,694 more rows

Select columns with select()

library(dplyr)

select(gapminder, country:pop)
# A tibble: 1,704 × 5
   country     continent  year lifeExp      pop
   <fct>       <fct>     <int>   <dbl>    <int>
 1 Afghanistan Asia       1952    28.8  8425333
 2 Afghanistan Asia       1957    30.3  9240934
 3 Afghanistan Asia       1962    32.0 10267083
 4 Afghanistan Asia       1967    34.0 11537966
 5 Afghanistan Asia       1972    36.1 13079460
 6 Afghanistan Asia       1977    38.4 14880372
 7 Afghanistan Asia       1982    39.9 12881816
 8 Afghanistan Asia       1987    40.8 13867957
 9 Afghanistan Asia       1992    41.7 16317921
10 Afghanistan Asia       1997    41.8 22227415
# ℹ 1,694 more rows

Select columns with select()

library(dplyr)

select(gapminder, -(lifeExp:pop))
# A tibble: 1,704 × 4
   country     continent  year gdpPercap
   <fct>       <fct>     <int>     <dbl>
 1 Afghanistan Asia       1952      779.
 2 Afghanistan Asia       1957      821.
 3 Afghanistan Asia       1962      853.
 4 Afghanistan Asia       1967      836.
 5 Afghanistan Asia       1972      740.
 6 Afghanistan Asia       1977      786.
 7 Afghanistan Asia       1982      978.
 8 Afghanistan Asia       1987      852.
 9 Afghanistan Asia       1992      649.
10 Afghanistan Asia       1997      635.
# ℹ 1,694 more rows

Add new variables with mutate()

library(dplyr)

mutate(gapminder, gdp = gdpPercap * pop)
# A tibble: 1,704 × 7
   country     continent  year lifeExp      pop gdpPercap          gdp
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
 1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.
 2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.
 3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.
 4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.
 5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.
 6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.
 7 Afghanistan Asia       1982    39.9 12881816      978. 12598563401.
 8 Afghanistan Asia       1987    40.8 13867957      852. 11820990309.
 9 Afghanistan Asia       1992    41.7 16317921      649. 10595901589.
10 Afghanistan Asia       1997    41.8 22227415      635. 14121995875.
# ℹ 1,694 more rows

Select, transform, and add new variables with transmute()

library(dplyr)

transmute(gapminder, country, year, gdp = gdpPercap * pop)
# A tibble: 1,704 × 3
   country      year          gdp
   <fct>       <int>        <dbl>
 1 Afghanistan  1952  6567086330.
 2 Afghanistan  1957  7585448670.
 3 Afghanistan  1962  8758855797.
 4 Afghanistan  1967  9648014150.
 5 Afghanistan  1972  9678553274.
 6 Afghanistan  1977 11697659231.
 7 Afghanistan  1982 12598563401.
 8 Afghanistan  1987 11820990309.
 9 Afghanistan  1992 10595901589.
10 Afghanistan  1997 14121995875.
# ℹ 1,694 more rows

Create summaries with summarise()

library(dplyr)

summarise(
  gapminder, avg_pop = mean(pop), avg_gdp_per_cap = mean(gdpPercap)
)
# A tibble: 1 × 2
    avg_pop avg_gdp_per_cap
      <dbl>           <dbl>
1 29601212.           7215.

Creating grouped summaries with group_by() and summarise()

library(dplyr)

gapminder_continent <- group_by(gapminder, continent)

summarise(
  gapminder_continent, avg_pop = mean(pop), avg_gdp_per_cap = mean(gdpPercap)
)
# A tibble: 5 × 3
  continent   avg_pop avg_gdp_per_cap
  <fct>         <dbl>           <dbl>
1 Africa     9916003.           2194.
2 Americas  24504795.           7136.
3 Asia      77038722.           7902.
4 Europe    17169765.          14469.
5 Oceania    8874672.          18622.

Combine multiple operations with the pipe

library(dplyr)

gapminder |> 
  group_by(continent) |> 
  summarise(avg_pop = mean(pop), avg_gdp_per_cap = mean(gdpPercap)) |> 
  arrange(avg_gdp_per_cap)
# A tibble: 5 × 3
  continent   avg_pop avg_gdp_per_cap
  <fct>         <dbl>           <dbl>
1 Africa     9916003.           2194.
2 Americas  24504795.           7136.
3 Asia      77038722.           7902.
4 Europe    17169765.          14469.
5 Oceania    8874672.          18622.

Combine multiple operations with the pipe

library(dplyr)
library(ggplot2)

gapminder |> 
  group_by(continent) |> 
  summarise(avg_pop = mean(pop), avg_gdp_per_cap = mean(gdpPercap)) |> 
  ggplot(aes(x = continent, y = avg_gdp_per_cap)) + 
  geom_col() + 
  theme_minimal()

A note on the pipe

Base pipe:

  • |>

  • Can be used without loading any packages

Tidyverse pipe:

  • %>%

  • Must load dplyr or magrittr to use

A handy shortcut for counting data with count()

library(dplyr)

gapminder |> 
  distinct(continent, country) |> 
  group_by(continent) |> 
  summarise(n_countries = n())
# A tibble: 5 × 2
  continent n_countries
  <fct>           <int>
1 Africa             52
2 Americas           25
3 Asia               33
4 Europe             30
5 Oceania             2
library(dplyr)

gapminder |> 
  distinct(continent, country) |> 
  count(continent)
# A tibble: 5 × 2
  continent     n
  <fct>     <int>
1 Africa       52
2 Americas     25
3 Asia         33
4 Europe       30
5 Oceania       2

Summary

This morning you have:

  1. Learnt R basic syntax

  2. Learnt how to transform your data